################################################################################################
# This script takes the clean SV data and calculates the individual probability of the BV 
# being cast on the issue it was cast on. The following rules are tested:
# 
# - P_Max: Cast BV on issue with max points 
# - P_Sal: Cast BV the most salient election (i.e. Immigration).
# - P_Educ: Cast a BV on the education-related proposal with most points (Teachers or Bilingual)
# - P_Rand: Cast a BV randomly. Expressed tacitly as P_Rand = 1 - [P_Max + P_Sal + P_Educ]
# 
# Creates columns based on these and then uses MLE to estimate the values using original data
# and boostrapped data (sampling w/ replacement) to form a new sample of the same size.
# 
# All data is the saved to a dataframe, SV_ParamEstimates, and exported to the folder
# SV & QV Simulations (No Recalibration)/SV - Raw Simulation Data/
# 
# Author: Luis S.
# Last Modified: 8.10.17
################################################################################################

library(dplyr)
# library(nloptr) # has method of moving asymptotes (mma fn with constraints)
# library(dfoptim) # uses arctan with derivative free algorithms, e.g. Nelder mead, to get box constraints  

# setwd("C:/Users/Luis/Dropbox/Research/SV vs QV California Experiment/Experiment Analysis & Results/")

# Creates dataframe with original parameters and the bootstrapped data
bootstrapIterations <- 10000

SV_ParamEstimates <- data.frame(Round = seq(0,bootstrapIterations,1), 
                                SV1_P_Max = numeric(bootstrapIterations+1), SV1_P_Educ = numeric(bootstrapIterations+1), 
                                SV1_P_Sal = numeric(bootstrapIterations+1), SV1_P_Rand = numeric(bootstrapIterations+1), 
                                SV2_P_Max = numeric(bootstrapIterations+1), SV2_P_Educ = numeric(bootstrapIterations+1), 
                                SV2_P_Sal = numeric(bootstrapIterations+1), SV2_P_Rand = numeric(bootstrapIterations+1))

SV_data <- read.csv("California Stage 2 - SV & QV Data/Clean Data/California_MTurk_SV_Stage_2_clean&Recoded_AllPos.csv", header = T, stringsAsFactors = F)
SV_data <- select(SV_data, -matches("Abstain")) # removes some summary stuff created previously

####################################
#### SV Strat MLE Calculator Fn ####
###################################%

estimateParams <- function(sampleData,simIndex,SV_ParamEstimates)
{
  # sampleData <- SV_data
  # simIndex <- 1
  
  #--- Statistical Model Specification
  statModel_threeParam <- function(par, data_pmax, data_psal, data_peduc)
  {
    log.likelihood <- 0
    
    Pmax <- par[1] # is the estimate for P_Max
    Psal <- par[2] # is the estimate for P_Sal
    Peduc <- par[3] # is the estimate for P_Educ
    
    for(item in 1:length(data_pmax))
    {
      
      currentLikelihood <- log(data_pmax[item]*Pmax + data_psal[item]*Psal + data_peduc[item]*Peduc + .25*(1 - Pmax - Psal - Peduc))
      
      log.likelihood <- log.likelihood + currentLikelihood
    }
    
    return(-log.likelihood)
    
  }
  
  #--- iterate through SV1 and SV2
  
  sampleData <- sampleData %>% 
    mutate(MaxPoints = pmax(EducationPref, TeachersPref, BondsPref, ImmigrationPref), EducMaxPoints = pmax(EducationPref, TeachersPref)) 
  
  for(voteIndex in c("SV1","SV2")) 
  {
    # voteIndex <- "SV1"
    voteCategory <- paste(voteIndex,"Vote",sep="_")
    # create the dataframe to calculate coefficient variables
    sampleData <- sampleData %>% mutate(P_Max = NA, P_Sal = NA, P_Educ = NA, P_Rand = .25, MaxPointsCount = 0, EducMaxPointsCount = 0)
    
    # iterate through responses and calculate coeff vars
    for(row in 1:nrow(sampleData))
    {
      # Calculate the Max number of points assigned to any proposal & count how many proposals share this number
      if(sampleData$EducationPref[row] == sampleData$MaxPoints[row]) {sampleData$MaxPointsCount[row] <- sampleData$MaxPointsCount[row] + 1} 
      if(sampleData$TeachersPref[row] == sampleData$MaxPoints[row]) {sampleData$MaxPointsCount[row] <- sampleData$MaxPointsCount[row] + 1}
      if(sampleData$ImmigrationPref[row] == sampleData$MaxPoints[row]) {sampleData$MaxPointsCount[row] <- sampleData$MaxPointsCount[row] + 1} 
      if(sampleData$BondsPref[row] == sampleData$MaxPoints[row]) {sampleData$MaxPointsCount[row] <- sampleData$MaxPointsCount[row] + 1}
      
      # Calculate the Max number of points assigned to the education proposals & count how many share this number
      sampleData$EducMaxPointsCount[row] <- ifelse((sampleData$EducationPref[row] == sampleData$EducMaxPoints[row]) & 
                                                     (sampleData$TeachersPref[row] == sampleData$EducMaxPoints[row]),2,1)
      
      # Calculating P_Sal
      sampleData$P_Sal[row] <- ifelse(sampleData[[voteCategory]][row] == "Immigration", 1, 0)
      
      # Calculating P_Max
      if((sampleData[[voteCategory]][row] == "BilingualEduc") & (sampleData$EducationPref[row] == sampleData$MaxPoints[row]))  
      {
        sampleData$P_Max[row] <- 1/sampleData$MaxPointsCount[row]
      } else if((sampleData[[voteCategory]][row] == "TeacherTenure") & (sampleData$TeachersPref[row] == sampleData$MaxPoints[row]))  
      {
        sampleData$P_Max[row] <- 1/sampleData$MaxPointsCount[row]
      } else if((sampleData[[voteCategory]][row] == "PublicBonds") & (sampleData$BondsPref[row] == sampleData$MaxPoints[row]))  
      {
        sampleData$P_Max[row] <- 1/sampleData$MaxPointsCount[row]
      } else if((sampleData[[voteCategory]][row] == "Immigration") & (sampleData$ImmigrationPref[row] == sampleData$MaxPoints[row]))  
      {
        sampleData$P_Max[row] <- 1/sampleData$MaxPointsCount[row]
      } else {sampleData$P_Max[row] <- 0}
      
      # Calculating P_Educ
      if((sampleData[[voteCategory]][row] == "BilingualEduc") | (sampleData[[voteCategory]][row] == "TeacherTenure"))  
      {
        if((sampleData[[voteCategory]][row] == "BilingualEduc") & (sampleData$EducationPref[row] == sampleData$EducMaxPoints[row]))  
        {
          sampleData$P_Educ[row] <- 1/sampleData$EducMaxPointsCount[row]
        } else if((sampleData[[voteCategory]][row] == "TeacherTenure") & (sampleData$TeachersPref[row] == sampleData$EducMaxPoints[row]))  
        {
          sampleData$P_Educ[row] <- 1/sampleData$EducMaxPointsCount[row]
        } else {sampleData$P_Educ[row] <- 0}
      } else {sampleData$P_Educ[row] <- 0}
    }
    
    # get estimates 
    SampleParamMLE <- NA
    startingParams <- c(.35,.3,.3)
    startingParams <- c(.6,.15,.05)
    while(is.na(SampleParamMLE))
    {
      try(SampleParamMLE <- optim(par = startingParams, fn = statModel_threeParam, data_pmax = sampleData$P_Max, 
                                  data_psal = sampleData$P_Sal, data_peduc = sampleData$P_Educ, 
                                  lower = c(0,0,0), upper = c(1,1,1), method = "L-BFGS-B")
        ,silent = T)
      
      startingParams <- c(runif(1,.2,.7), runif(1,.2,.5),runif(1,.2,.4))
    }
    
    SV_ParamEstimates[[paste(voteIndex,"P_Max", sep="_")]][simIndex] <<- round(SampleParamMLE$par[1], digits = 3)
    SV_ParamEstimates[[paste(voteIndex,"P_Sal", sep="_")]][simIndex] <<- round(SampleParamMLE$par[2], digits = 3)
    SV_ParamEstimates[[paste(voteIndex,"P_Educ", sep="_")]][simIndex] <<- round(SampleParamMLE$par[3], digits = 3)
  }
}


#####################################
#### SV Bootstrapped Param Calc. ####
#####################################

set.seed(7)
sampleSize <- nrow(SV_data)

# This loop bootstraps a new sample for SV of the same size using sampling w/ replacement
for(bsCount in 1:nrow(SV_ParamEstimates)) 
{
  print(bsCount-1)
  
  # Select a random sample of rows from the original dataset and create the bootstapped dataset
  randRowNum <- sample(1:sampleSize, sampleSize, replace = T)
  BS_data <- SV_data[randRowNum,]
  
  # Number the rows to avoid conflicts
  row.names(BS_data) <- seq(1,nrow(BS_data),1)
  
  # for the first iteration, calculate the parameters using the original data
  if(bsCount == 1){BS_data <- SV_data}
  
  # get estimates
  estimateParams(BS_data,bsCount,SV_ParamEstimates)
}

SV_ParamEstimates <- mutate(SV_ParamEstimates, SV1_P_Rand = 1 - (SV1_P_Max + SV1_P_Sal + SV1_P_Educ),
                            SV2_P_Rand = 1 - (SV2_P_Max + SV2_P_Sal + SV2_P_Educ))

# write.csv(SV_ParamEstimates, "SV & QV Simulations (No Recalibration)/SV - Raw Simulation Data/SV_StratProb_ParamEstimateMLE.csv", row.names = F)
write.csv(SV_ParamEstimates, "Misc - SV & QV Descriptive Model Parameter Estimation/SV_StratProb_ParamEstimateMLE.csv", row.names = F)


#####################################
#### SV Bootstrapped Param Calc. ####
####################################%

# simData <- read.csv("SV_StratProb_ParamEstimateMLE.csv", stringsAsFactors = F, header = T)
# SV_ParamEstimates <- simData

# SV_StatModelData <- summarise(SV_ParamEstimates, SV1_P_Max = mean(SV1_P_Max), SV1_P_Rand = mean(SV1_P_Rand),
#                               SV1_P_Educ = mean(SV1_P_Educ), SV1_P_Sal = mean(SV1_P_Sal), SV2_P_Max = mean(SV2_P_Max), 
#                               SV2_P_Rand = mean(SV2_P_Rand), SV2_P_Educ = mean(SV2_P_Educ), SV2_P_Sal = mean(SV2_P_Sal)) 
# 
# SV_StatModelData <- round(SV_StatModelData, digits = 2)
# 
# variableName <- names(SV_StatModelData)
# SV_StatModelData <- data.frame(VarName = variableName, Mean = as.data.frame(t(SV_StatModelData))$V1)
# SV_StatModelData <- mutate(SV_StatModelData, Upper95CI = NA, Lower95CI = NA)
# 
# lowerbound <- .025*(nrow(SV_ParamEstimates)-1)
# upperbound <- .975*(nrow(SV_ParamEstimates)-1)
# 
# #### SV1 ####
# 
# SV_ParamTemp <- arrange(SV_ParamEstimates, SV1_P_Max)[lowerbound+1:upperbound,]
# SV_StatModelData$Upper95CI[1] <- max(SV_ParamTemp$SV1_P_Max)
# SV_StatModelData$Lower95CI[1] <- min(SV_ParamTemp$SV1_P_Max)
# 
# SV_ParamTemp <- arrange(SV_ParamEstimates, SV1_P_Rand)[lowerbound+1:upperbound,]
# SV_StatModelData$Upper95CI[2] <- max(SV_ParamTemp$SV1_P_Rand)
# SV_StatModelData$Lower95CI[2] <- min(SV_ParamTemp$SV1_P_Rand)
# 
# SV_ParamTemp <- arrange(SV_ParamEstimates, SV1_P_Educ)[lowerbound+1:upperbound,]
# SV_StatModelData$Upper95CI[3] <- max(SV_ParamTemp$SV1_P_Educ)
# SV_StatModelData$Lower95CI[3] <- min(SV_ParamTemp$SV1_P_Educ)
# 
# SV_ParamTemp <- arrange(SV_ParamEstimates, SV1_P_Sal)[lowerbound+1:upperbound,]
# SV_StatModelData$Upper95CI[4] <- max(SV_ParamTemp$SV1_P_Sal)
# SV_StatModelData$Lower95CI[4] <- min(SV_ParamTemp$SV1_P_Sal)
# 
# #### SV2 ####
# 
# SV_ParamTemp <- arrange(SV_ParamEstimates, SV2_P_Max)[lowerbound+1:upperbound,]
# SV_StatModelData$Upper95CI[5] <- max(SV_ParamTemp$SV2_P_Max)
# SV_StatModelData$Lower95CI[5] <- min(SV_ParamTemp$SV2_P_Max)
# 
# SV_ParamTemp <- arrange(SV_ParamEstimates, SV2_P_Rand)[lowerbound+1:upperbound,]
# SV_StatModelData$Upper95CI[6] <- max(SV_ParamTemp$SV2_P_Rand)
# SV_StatModelData$Lower95CI[6] <- min(SV_ParamTemp$SV2_P_Rand)
# 
# SV_ParamTemp <- arrange(SV_ParamEstimates, SV2_P_Educ)[lowerbound+1:upperbound,]
# SV_StatModelData$Upper95CI[7] <- max(SV_ParamTemp$SV2_P_Educ)
# SV_StatModelData$Lower95CI[7] <- min(SV_ParamTemp$SV2_P_Educ)
# 
# SV_ParamTemp <- arrange(SV_ParamEstimates, SV2_P_Sal)[lowerbound+1:upperbound,]
# SV_StatModelData$Upper95CI[8] <- max(SV_ParamTemp$SV2_P_Sal)
# SV_StatModelData$Lower95CI[8] <- min(SV_ParamTemp$SV2_P_Sal)
# 
# write.csv(SV_StatModelData, "SV & QV Simulations (No Recalibration)/SV - Raw Simulation Data/SV_StratProb_ParamEstimateMLE_Summary&95CI.csv", row.names = F)

